library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(caret)
library(ggRandomForests)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
Barriers in Healthcare Utilization (VSURF)
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
# Read the AAQoL survey export and prepare the factors used by the models:
# every text column becomes a factor, nominal predictors get an explicit
# reference level, income is dichotomised, and the ordinal acculturation
# items get their level order spelled out.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  # Reference categories for the nominal predictors.
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese"),
    Religion = relevel(Religion, ref = "None")
  ) |>
  # Brackets up to $59,999 are "Below", $60,000 and over are "Above". Any
  # other value passes through unchanged and turns into NA once the two
  # factor levels are applied.
  mutate(
    Income_median = case_match(
      Income,
      c(
        "$0 - $9,999",
        "$10,000 - $19,999",
        "$20,000 - $29,999",
        "$30,000 - $39,999",
        "$40,000 - $49,999",
        "$50,000 - $59,999"
      ) ~ "Below",
      c("$60,000 - $69,999", "$70,000 and over") ~ "Above",
      .default = Income
    ),
    Income_median = factor(Income_median, levels = c("Below", "Above"))
  ) |>
  # Fix the level order of the ordinal items.
  mutate(
    across(
      `Familiarity with America`:`Familiarity with Ethnic Origin`,
      \(x) factor(x, levels = c("Very low", "Low", "High", "Very high"))
    ),
    `Identify Ethnically` = factor(
      `Identify Ethnically`,
      levels = c("Not at all", "Not very close", "Somewhat close", "Very close")
    ),
    Belonging = factor(
      Belonging,
      levels = c("Not at all", "Not very much", "Somewhat", "Very much")
    ),
    `Primary Language` = as.factor(`Primary Language`)
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
# Show the whole data set as an interactive DT table for inspection.
# NOTE(review): the warning that follows reports the table is large for
# client-side rendering.
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Unmet Health Need
# Outcome: Unmet Health Need. Build the modelling frame with the response in
# column 1 (later code indexes predictors via rfdata[, -1]), keep complete
# cases only, and make the column names syntactic for the formula interface.
rfdata <- qol |>
  select(
    `Unmet Health Need`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  as.data.frame() |>
  rename_with(make.names)

# Sample that VSURF is run on; seed = 3 is handed to ROSE for its random draw.
# NOTE(review): presumably a class-rebalanced synthetic sample, per the ROSE
# package documentation - confirm.
imbal <- ROSE::ROSE(Unmet.Health.Need ~ ., data = rfdata, seed = 3)$data

# Variable selection on the ROSE sample.
vsurf.mod <- VSURF(
  Unmet.Health.Need ~ .,
  imbal,
  na.action = "na.omit",
  parallel = TRUE,
  verbose = FALSE
)
Warning in VSURF.formula(Unmet.Health.Need ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# Print the VSURF run summary: timing and how many variables remain after
# each of the three steps.
vsurf.mod |> summary()
VSURF computation time: 23.3 secs
VSURF selected:
18 variables at thresholding step (in 5 secs)
15 variables at interpretation step (in 2.9 secs)
14 variables at prediction step (in 15.4 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
# Names of the predictors kept at the prediction step. varselect.pred holds
# positions among the formula's predictors, so the response (column 1 of
# rfdata) is dropped before indexing.
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "English.Speaking" "Dental.Insurance"
[3] "Religion" "Discrimination"
[5] "Ethnicity" "English.Difficulties"
[7] "Income_median" "Belonging"
[9] "Familiarity.with.America" "Identify.Ethnically"
[11] "Age" "Familiarity.with.Ethnic.Origin"
[13] "Gender" "Full.Time.Employment"
# Names of the predictors kept at the interpretation step, looked up the same
# way (response column dropped so positions line up).
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "English.Speaking" "Dental.Insurance"
[3] "Religion" "Discrimination"
[5] "Ethnicity" "English.Difficulties"
[7] "Income_median" "Belonging"
[9] "Familiarity.with.America" "Identify.Ethnically"
[11] "Age" "Familiarity.with.Ethnic.Origin"
[13] "Duration.of.Residency" "Gender"
[15] "Full.Time.Employment"
# Plot the VSURF result object for this outcome.
plot(vsurf.mod)
# NOTE(review): per the VSURF docs, mean.perf is the mean OOB error of forests
# grown on all variables; here it is measured on the ROSE sample - confirm.
# The "[1] ..." after the expression is its printed value, not an index.
vsurf.mod$mean.perf[1] 0.06919959
Importance
# Importance table: one row per predictor, in the order VSURF ranks them,
# with the mean importance and its standard deviation.
vi <- data.frame(
  Variable = colnames(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour key used by the bar chart: Ethnicity in red, everything else black.
vi$fill <- if_else(vi$Variable == "Ethnicity", "red", "black", missing = "black")

# Display only: both numeric columns rounded to 5 decimals, vi is not modified.
vi |> mutate(across(Importance:sd_Importance, \(v) round(v, 5)))
Variable Importance sd_Importance fill
1 English.Speaking 0.10924 0.00212 black
2 Dental.Insurance 0.08979 0.00189 black
3 Religion 0.08374 0.00083 black
4 Discrimination 0.07910 0.00122 black
5 Ethnicity 0.07475 0.00139 red
6 English.Difficulties 0.06901 0.00136 black
7 Income_median 0.05219 0.00187 black
8 Belonging 0.05137 0.00077 black
9 Familiarity.with.America 0.03975 0.00081 black
10 Identify.Ethnically 0.03611 0.00081 black
11 Age 0.03282 0.00108 black
12 Familiarity.with.Ethnic.Origin 0.03249 0.00082 black
13 Duration.of.Residency 0.02983 0.00079 black
14 Gender 0.02518 0.00084 black
15 Full.Time.Employment 0.02381 0.00081 black
16 Primary.Language 0.01325 0.00053 black
17 Health.Insurance 0.01282 0.00050 black
18 US.Born 0.00268 0.00014 black
# Bar chart of the VSURF importances in vi, x axis ordered by increasing
# importance, with +/- 1 SD error bars and the Ethnicity bar drawn in red.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(alpha = 0.4) +
  geom_errorbar(
    aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)
  ) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values bind each colour to its own key instead of depending on the
  # alphabetical order of the values in vi$fill.
  scale_fill_manual(values = c(black = "black", red = "red"), guide = "none")
plot(importance_plot)
# Pass the plot explicitly: without `plot =`, ggsave() writes whichever ggplot
# was displayed last, which breaks silently if the chunk order changes.
ggsave(
  filename = "VSURF_importance_unmethealth.png",
  plot = importance_plot,
  width = 12, height = 8, units = "in"
)
Unmet Dental Needs
# Outcome: Unmet Dental Needs. Build the modelling frame with the response in
# column 1 (later code indexes predictors via rfdata[, -1]), keep complete
# cases only, and make the column names syntactic for the formula interface.
rfdata <- qol |>
  select(
    `Unmet Dental Needs`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  as.data.frame() |>
  rename_with(make.names)

# Sample that VSURF is run on; seed = 3 is handed to ROSE for its random draw.
# NOTE(review): presumably a class-rebalanced synthetic sample, per the ROSE
# package documentation - confirm.
imbal <- ROSE::ROSE(Unmet.Dental.Needs ~ ., data = rfdata, seed = 3)$data

# Variable selection on the ROSE sample.
vsurf.mod <- VSURF(
  Unmet.Dental.Needs ~ .,
  imbal,
  na.action = "na.omit",
  parallel = TRUE,
  verbose = FALSE
)
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# Print the VSURF run summary: timing and how many variables remain after
# each of the three steps.
vsurf.mod |> summary()
VSURF computation time: 16.6 secs
VSURF selected:
18 variables at thresholding step (in 5.1 secs)
17 variables at interpretation step (in 3 secs)
1 variables at prediction step (in 8.6 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
# Names of the predictors kept at the prediction step. varselect.pred holds
# positions among the formula's predictors, so the response (column 1 of
# rfdata) is dropped before indexing.
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance"
# Same lookup for the interpretation-step selection.
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Dental.Insurance" "Religion"
[3] "Ethnicity" "English.Speaking"
[5] "Income_median" "Familiarity.with.America"
[7] "English.Difficulties" "Belonging"
[9] "Discrimination" "Age"
[11] "Identify.Ethnically" "Duration.of.Residency"
[13] "Familiarity.with.Ethnic.Origin" "Health.Insurance"
[15] "Gender" "Primary.Language"
[17] "Full.Time.Employment"
# Plot the VSURF result object for this outcome.
plot(vsurf.mod)
# NOTE(review): per the VSURF docs, mean.perf is the mean OOB error of forests
# grown on all variables; here it is measured on the ROSE sample - confirm.
# The "[1] ..." after the expression is its printed value, not an index.
vsurf.mod$mean.perf[1] 0.09077744
Importance
# Importance table: one row per predictor, in the order VSURF ranks them,
# with the mean importance and its standard deviation.
vi <- data.frame(
  Variable = colnames(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour key used by the bar chart: Ethnicity in red, everything else black.
vi$fill <- if_else(vi$Variable == "Ethnicity", "red", "black", missing = "black")

# Display only: both numeric columns rounded to 5 decimals, vi is not modified.
vi |> mutate(across(Importance:sd_Importance, \(v) round(v, 5)))
Variable Importance sd_Importance fill
1 Dental.Insurance 0.10495 0.00201 black
2 Religion 0.07101 0.00098 black
3 Ethnicity 0.06735 0.00116 red
4 English.Speaking 0.06427 0.00153 black
5 Income_median 0.05603 0.00170 black
6 Familiarity.with.America 0.05383 0.00091 black
7 English.Difficulties 0.05107 0.00106 black
8 Belonging 0.04901 0.00111 black
9 Discrimination 0.04897 0.00125 black
10 Age 0.03837 0.00072 black
11 Identify.Ethnically 0.03301 0.00101 black
12 Duration.of.Residency 0.03009 0.00052 black
13 Familiarity.with.Ethnic.Origin 0.02884 0.00074 black
14 Health.Insurance 0.02815 0.00098 black
15 Gender 0.02530 0.00073 black
16 Primary.Language 0.02197 0.00058 black
17 Full.Time.Employment 0.01837 0.00046 black
18 US.Born 0.00235 0.00018 black
# Bar chart of the VSURF importances in vi, x axis ordered by increasing
# importance, with +/- 1 SD error bars and the Ethnicity bar drawn in red.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(alpha = 0.4) +
  geom_errorbar(
    aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)
  ) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values bind each colour to its own key instead of depending on the
  # alphabetical order of the values in vi$fill.
  scale_fill_manual(values = c(black = "black", red = "red"), guide = "none")
plot(importance_plot)
# Pass the plot explicitly: without `plot =`, ggsave() writes whichever ggplot
# was displayed last, which breaks silently if the chunk order changes.
ggsave(
  filename = "VSURF_importance_unmetdental.png",
  plot = importance_plot,
  width = 12, height = 8, units = "in"
)
Physical Check-up
# Outcome: Physical Check-up. Build the modelling frame with the response in
# column 1 (later code indexes predictors via rfdata[, -1]), keep complete
# cases only, shorten three predictor names, and make all column names
# syntactic for the formula interface.
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame() |>
  rename_with(make.names)

# Sample that VSURF is run on; seed = 3 is handed to ROSE for its random draw.
# NOTE(review): presumably a class-rebalanced synthetic sample, per the ROSE
# package documentation - confirm.
imbal <- ROSE::ROSE(Physical.Check.up ~ ., data = rfdata, seed = 3)$data

# Variable selection on the ROSE sample.
vsurf.mod <- VSURF(
  Physical.Check.up ~ .,
  imbal,
  na.action = "na.omit",
  parallel = TRUE,
  verbose = FALSE
)
Warning in VSURF.formula(Physical.Check.up ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# Print the VSURF run summary: timing and how many variables remain after
# each of the three steps.
vsurf.mod |> summary()
VSURF computation time: 24 secs
VSURF selected:
18 variables at thresholding step (in 5.7 secs)
14 variables at interpretation step (in 3.3 secs)
12 variables at prediction step (in 15 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
# Names of the predictors kept at the prediction step. varselect.pred holds
# positions among the formula's predictors, so the response (column 1 of
# rfdata) is dropped before indexing.
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Duration.of.Residency" "Dental.Insurance"
[3] "Ethnicity" "Health.Insurance"
[5] "Religion" "EnglishDiff"
[7] "EnglishSpeak" "Familiarity.with.Ethnic.Origin"
[9] "Belonging" "Familiarity.with.America"
[11] "Identify.Ethnically" "Gender"
# Names of the predictors kept at the interpretation step, looked up the same
# way (response column dropped so positions line up).
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Duration.of.Residency" "Dental.Insurance"
[3] "Ethnicity" "Health.Insurance"
[5] "Religion" "Age"
[7] "EnglishDiff" "EnglishSpeak"
[9] "Familiarity.with.Ethnic.Origin" "Belonging"
[11] "Familiarity.with.America" "Identify.Ethnically"
[13] "Income_median" "Gender"
# Plot the VSURF result object for this outcome.
plot(vsurf.mod)
# NOTE(review): per the VSURF docs, mean.perf is the mean OOB error of forests
# grown on all variables; here it is measured on the ROSE sample - confirm.
# The "[1] ..." after the expression is its printed value, not an index.
vsurf.mod$mean.perf[1] 0.1762462
Importance
# Importance table: one row per predictor, in the order VSURF ranks them,
# with the mean importance and its standard deviation.
vi <- data.frame(
  Variable = colnames(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour key used by the bar chart: Ethnicity in red, everything else black.
vi$fill <- if_else(vi$Variable == "Ethnicity", "red", "black", missing = "black")

# Display only: both numeric columns rounded to 5 decimals, vi is not modified.
vi |> mutate(across(Importance:sd_Importance, \(v) round(v, 5)))
Variable Importance sd_Importance fill
1 Duration.of.Residency 0.05958 0.00108 black
2 Dental.Insurance 0.05820 0.00115 black
3 Ethnicity 0.04682 0.00101 red
4 Health.Insurance 0.03525 0.00079 black
5 Religion 0.03338 0.00078 black
6 Age 0.03318 0.00078 black
7 EnglishDiff 0.03246 0.00075 black
8 EnglishSpeak 0.02352 0.00060 black
9 Familiarity.with.Ethnic.Origin 0.02108 0.00049 black
10 Belonging 0.02087 0.00047 black
11 Familiarity.with.America 0.02022 0.00040 black
12 Identify.Ethnically 0.01734 0.00059 black
13 Income_median 0.01670 0.00097 black
14 Gender 0.01657 0.00060 black
15 Discrimination 0.01428 0.00057 black
16 Employment 0.01149 0.00032 black
17 Primary.Language 0.01130 0.00061 black
18 US.Born 0.00658 0.00025 black
# Bar chart of the VSURF importances in vi, x axis ordered by increasing
# importance, with +/- 1 SD error bars and the Ethnicity bar drawn in red.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(alpha = 0.4) +
  geom_errorbar(
    aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)
  ) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values bind each colour to its own key instead of depending on the
  # alphabetical order of the values in vi$fill.
  scale_fill_manual(values = c(black = "black", red = "red"), guide = "none")
plot(importance_plot)
# Pass the plot explicitly: without `plot =`, ggsave() writes whichever ggplot
# was displayed last, which breaks silently if the chunk order changes.
ggsave(
  filename = "VSURF_importance_PC_ROSE.png",
  plot = importance_plot,
  width = 12, height = 8, units = "in"
)
Dental Check-up
# Outcome: Dentist Check-up. Build the modelling frame with the response in
# column 1 (later code indexes predictors via rfdata[, -1]), keep complete
# cases only, and make the column names syntactic for the formula interface.
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  as.data.frame() |>
  rename_with(make.names)

# Sample that VSURF is run on; seed = 3 is handed to ROSE for its random draw.
# NOTE(review): presumably a class-rebalanced synthetic sample, per the ROSE
# package documentation - confirm.
imbal <- ROSE::ROSE(Dentist.Check.up ~ ., data = rfdata, seed = 3)$data

# Variable selection on the ROSE sample.
vsurf.mod <- VSURF(
  Dentist.Check.up ~ .,
  imbal,
  na.action = "na.omit",
  parallel = TRUE,
  verbose = FALSE
)
Warning in VSURF.formula(Dentist.Check.up ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# VSURF(Dentist.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# Print the VSURF run summary: timing and how many variables remain after
# each of the three steps.
vsurf.mod |> summary()
VSURF computation time: 18.9 secs
VSURF selected:
18 variables at thresholding step (in 5.2 secs)
12 variables at interpretation step (in 3.1 secs)
10 variables at prediction step (in 10.6 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
# Names of the predictors kept at the prediction step. varselect.pred holds
# positions among the formula's predictors, so the response (column 1 of
# rfdata) is dropped before indexing.
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Dental.Insurance" "Duration.of.Residency"
[3] "Ethnicity" "English.Difficulties"
[5] "Income_median" "English.Speaking"
[7] "Familiarity.with.Ethnic.Origin" "Identify.Ethnically"
[9] "Belonging" "Familiarity.with.America"
# Names of the predictors kept at the interpretation step, looked up the same
# way (response column dropped so positions line up).
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Dental.Insurance" "Duration.of.Residency"
[3] "Ethnicity" "Religion"
[5] "English.Difficulties" "Income_median"
[7] "English.Speaking" "Familiarity.with.Ethnic.Origin"
[9] "Age" "Identify.Ethnically"
[11] "Belonging" "Familiarity.with.America"
# Plot the VSURF result object for this outcome.
plot(vsurf.mod)
# NOTE(review): per the VSURF docs, mean.perf is the mean OOB error of forests
# grown on all variables; here it is measured on the ROSE sample - confirm.
# The "[1] ..." after the expression is its printed value, not an index.
vsurf.mod$mean.perf[1] 0.1694799
Importance
# Importance table: one row per predictor, in the order VSURF ranks them,
# with the mean importance and its standard deviation.
vi <- data.frame(
  Variable = colnames(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour key used by the bar chart: Ethnicity in red, everything else black.
vi$fill <- if_else(vi$Variable == "Ethnicity", "red", "black", missing = "black")

# Display only: both numeric columns rounded to 5 decimals, vi is not modified.
vi |> mutate(across(Importance:sd_Importance, \(v) round(v, 5)))
Variable Importance sd_Importance fill
1 Dental.Insurance 0.10815 0.00106 black
2 Duration.of.Residency 0.05527 0.00095 black
3 Ethnicity 0.04367 0.00062 red
4 Religion 0.03888 0.00067 black
5 English.Difficulties 0.02921 0.00065 black
6 Income_median 0.02900 0.00076 black
7 English.Speaking 0.02480 0.00077 black
8 Familiarity.with.Ethnic.Origin 0.01971 0.00063 black
9 Age 0.01905 0.00053 black
10 Identify.Ethnically 0.01773 0.00051 black
11 Belonging 0.01660 0.00037 black
12 Familiarity.with.America 0.01609 0.00049 black
13 Discrimination 0.01418 0.00066 black
14 Health.Insurance 0.01311 0.00059 black
15 Gender 0.01139 0.00043 black
16 Full.Time.Employment 0.00931 0.00044 black
17 Primary.Language 0.00675 0.00043 black
18 US.Born 0.00321 0.00025 black
# Bar chart of the VSURF importances in vi, x axis ordered by increasing
# importance, with +/- 1 SD error bars and the Ethnicity bar drawn in red.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(alpha = 0.4) +
  geom_errorbar(
    aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)
  ) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values bind each colour to its own key instead of depending on the
  # alphabetical order of the values in vi$fill.
  scale_fill_manual(values = c(black = "black", red = "red"), guide = "none")
plot(importance_plot)
# Pass the plot explicitly: without `plot =`, ggsave() writes whichever ggplot
# was displayed last, which breaks silently if the chunk order changes.
ggsave(
  filename = "VSURF_importance_Dc_ROSE.png",
  plot = importance_plot,
  width = 12, height = 8, units = "in"
)
Folkmedicine
# Outcome: Folkmedicine. Build the modelling frame with the response in
# column 1 (later code indexes predictors via rfdata[, -1]), keep complete
# cases only, and make the column names syntactic for the formula interface.
rfdata <- qol |>
  select(
    Folkmedicine, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  as.data.frame() |>
  rename_with(make.names)

# Sample that VSURF is run on; seed = 3 is handed to ROSE for its random draw.
# NOTE(review): presumably a class-rebalanced synthetic sample, per the ROSE
# package documentation - confirm.
imbal <- ROSE::ROSE(Folkmedicine ~ ., data = rfdata, seed = 3)$data

# Variable selection on the ROSE sample.
vsurf.mod <- VSURF(
  Folkmedicine ~ .,
  imbal,
  na.action = "na.omit",
  parallel = TRUE,
  verbose = FALSE
)
Warning in VSURF.formula(Folkmedicine ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# Print the VSURF run summary: timing and how many variables remain after
# each of the three steps.
vsurf.mod |> summary()
VSURF computation time: 27.2 secs
VSURF selected:
18 variables at thresholding step (in 5.3 secs)
16 variables at interpretation step (in 3.2 secs)
15 variables at prediction step (in 18.7 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
# Names of the predictors kept at the prediction step. varselect.pred holds
# positions among the formula's predictors, so the response (column 1 of
# rfdata) is dropped before indexing.
names(rfdata[,-1])[vsurf.mod$varselect.pred] [1] "Ethnicity" "English.Speaking"
[3] "Religion" "Age"
[5] "English.Difficulties" "Duration.of.Residency"
[7] "Familiarity.with.America" "Familiarity.with.Ethnic.Origin"
[9] "Belonging" "Full.Time.Employment"
[11] "Identify.Ethnically" "Gender"
[13] "Income_median" "Primary.Language"
[15] "Dental.Insurance"
# Names of the predictors kept at the interpretation step, looked up the same
# way (response column dropped so positions line up).
names(rfdata[,-1])[vsurf.mod$varselect.interp] [1] "Ethnicity" "English.Speaking"
[3] "Religion" "Age"
[5] "English.Difficulties" "Duration.of.Residency"
[7] "Familiarity.with.America" "Familiarity.with.Ethnic.Origin"
[9] "Belonging" "Full.Time.Employment"
[11] "Discrimination" "Identify.Ethnically"
[13] "Gender" "Income_median"
[15] "Primary.Language" "Dental.Insurance"
# Plot the VSURF result object for this outcome.
plot(vsurf.mod)
# NOTE(review): per the VSURF docs, mean.perf is the mean OOB error of forests
# grown on all variables; here it is measured on the ROSE sample - confirm.
# The "[1] ..." after the expression is its printed value, not an index.
vsurf.mod$mean.perf[1] 0.1121726
Importance
# Importance table: one row per predictor, in the order VSURF ranks them,
# with the mean importance and its standard deviation.
vi <- data.frame(
  Variable = colnames(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour key used by the bar chart: Ethnicity in red, everything else black.
vi$fill <- if_else(vi$Variable == "Ethnicity", "red", "black", missing = "black")

# Display only: both numeric columns rounded to 5 decimals, vi is not modified.
vi |> mutate(across(Importance:sd_Importance, \(v) round(v, 5)))
Variable Importance sd_Importance fill
1 Ethnicity 0.08673 0.00126 red
2 English.Speaking 0.07109 0.00168 black
3 Religion 0.06485 0.00114 black
4 Age 0.06303 0.00135 black
5 English.Difficulties 0.05479 0.00148 black
6 Duration.of.Residency 0.04271 0.00105 black
7 Familiarity.with.America 0.03900 0.00095 black
8 Familiarity.with.Ethnic.Origin 0.03833 0.00082 black
9 Belonging 0.03727 0.00086 black
10 Full.Time.Employment 0.03277 0.00130 black
11 Discrimination 0.03116 0.00083 black
12 Identify.Ethnically 0.02997 0.00084 black
13 Gender 0.02114 0.00075 black
14 Income_median 0.01866 0.00074 black
15 Primary.Language 0.01770 0.00091 black
16 Dental.Insurance 0.01726 0.00043 black
17 Health.Insurance 0.00745 0.00032 black
18 US.Born 0.00460 0.00027 black
# Bar chart of the VSURF importances in vi, x axis ordered by increasing
# importance, with +/- 1 SD error bars and the Ethnicity bar drawn in red.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(alpha = 0.4) +
  geom_errorbar(
    aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)
  ) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values bind each colour to its own key instead of depending on the
  # alphabetical order of the values in vi$fill.
  scale_fill_manual(values = c(black = "black", red = "red"), guide = "none")
plot(importance_plot)
# Pass the plot explicitly: without `plot =`, ggsave() writes whichever ggplot
# was displayed last, which breaks silently if the chunk order changes.
ggsave(
  filename = "VSURF_importance_Alt_ROSE.png",
  plot = importance_plot,
  width = 12, height = 8, units = "in"
)